#Homework 3
# Read sections 1, 3 and 4.1 (Section 2 is optional). Write a short report that
# replicates Tables II and III (pg. 638-640). The data can be obtained from the
# package AER.

#DIRECTIONS FOR REPRODUCTION
#Install packages if needed
#The code can be run from top to bottom.  

#PACKAGES
# install.packages("mgcv", repos = "http://lib.stat.cmu.edu/R/CRAN/")
# install.packages("sandwich", repos = "http://lib.stat.cmu.edu/R/CRAN/")
# install.packages("lmtest", repos = "http://lib.stat.cmu.edu/R/CRAN/")
library(sandwich)
library(lmtest)
library(mgcv)
library(AER)
library(car)

#GLOBAL VARIABLES.  (Variables that must be initialized for all sections.)
data(HousePrices)
P = HousePrices$price
DRV = HousePrices$driveway
REC = HousePrices$recreation
FFIN = HousePrices$fullbase
GHW = HousePrices$gasheat
CA = HousePrices$aircon
GAR = HousePrices$garage
REG = HousePrices$prefer
LOT = HousePrices$lotsize
BDMS = HousePrices$bedrooms
FB = HousePrices$bathrooms
STY = HousePrices$stories

#LOOK AT DATA
fix(HousePrices)
View(HousePrices)

#REPLICATION SECTION
# 1. Conduct the ordinary least squares regressions and compare your results to
# those reported in the paper. (1 point)
bench.reg = lm(log(P)~DRV + REC + FFIN + GHW + CA + GAR + REG + 
							 	log(LOT)+log(BDMS) + log(FB) + log(STY))
summary(bench.reg)

bench2.reg = lm(log(P)~DRV + REC + FFIN + GHW + CA + GAR + REG + 
									log(LOT)+BDMS + FB + STY)
summary(bench2.reg)

#Answer: check variation


# Extension

# After attempting to reproduce these tables, write a short report telling
# the five stories of these data. Use the benchmark model in Table III as a
# starting point. Try to improve our understanding of the sample and population 
# conditional distribution of house prices both qualitatively and quantitatively
# beyond this benchmark model. Here's what I expect to be included in the
# extension part of the report:



# 1. Create histograms of the house prices, log house prices, lot size, number
# of bedrooms, number of full bathrooms, and number of stories, with a summary
# of what you're seeing. (1 point)

png(filename = "ExtQ1-hist.png", width = 3000, #adjust png width
		height = 3000, #height
		res = 300)

par(mfrow = c(3,2), #sets number of rows and columns
		cex = 1.3, #scales text and symbols
		mar = c(4,3,1,0),#mar and oma set the inner and out margins
		oma = c(0,0,0,0))

hist(P,
		 main = "House Prices",
		 breaks = "FD",
		 xlab = "")
hist(log(P) ,
		 main = "Log of House Prices", breaks = "FD", xlab = "")
hist(LOT, 
		 main = "Lot Size", breaks = "FD",
		 xlab = "")
hist(BDMS, 
		 main = "Number of Bedrooms",
		 breaks = "FD", xlab = "")
hist(FB, 
		 main = "Number of Full Bathrooms",
		 breaks = "FD", xlab = "")
hist(STY, 
		 main = "Number of Stories",
		 breaks = "FD", xlab = "")
dev.off()


# 2. Create tables (using the table() function) of the driveway, recreational
# room, full and finished basement, gas for hot water heating, central air
# conditioning, and preferred neighborhood dummy variables, with a summary of
# what you're seeing. (1 point)


table(DRV)
table(REC)
table(GHW)
table(CA)
table(REG)

#The table command gives the number of values per level.  

# 3. Comment on whether you think the Gauss-Markov assumptions are met for the
# benchmark model in Table III. Which assumptions are likely to be met or not
# met and why? (1 point)




# 4. Plot the jackknife residuals versus fitted values for the benchmark
# regression using house prices rather than log house prices. What do you see?
# Using the Box-Cox scheme, what value of lambda would you suggest for a
# transformation of the house prices? Is the log transform appropriate for the
# dependent variable? Why or why not? (1 point)


#Plot of jacknife residuals versus the fitted values
nologbench.reg = lm(P~DRV + REC + FFIN + GHW + CA + GAR + REG + 
							 	log(LOT)+log(BDMS) + log(FB) + log(STY))

png(filename = "ExtQ4-jackniferesids.png", width = 3000, #adjust png width
		height = 4000, #height
		res = 300)

plot(fitted(nologbench.reg),rstudent(nologbench.reg),
		 xlab = "Fitted House Price ($)",
		 ylab = "Jacknife Residuals")

dev.off()


#Box-Cox scheme: determines power transformation for 
#dependent variable

png(filename = "ExtQ4-bcox.png",width = 3000, height = 3000, res = 300)
par(cex = 1.3)
bc = boxCox(P~DRV + REC + FFIN + GHW + CA + GAR + REG + 
							log(LOT)+log(BDMS) + log(FB) + log(STY),
						family = "yjPower", plotit = TRUE)
dev.off()


lambda = bc$x[which.max(bc$y)]

curve(log(x),from = 0, to = 20000, xname = "x")
curve(x^(0.06), from = 0, to = 20000, xname = "x")
#shape more important because it will be scaled with parameter.  


# 5. Create a component plus residual plot for the untransformed lot size
# variable in the benchmark regression model, using log transformed house
# prices. What transformation does the component plus residual plot suggest?
# Compare this to the value of lambda that the Box-Tidwell function suggests. What
# transformation of lot size should we use (if any)? (1 point)

#regression with LOT untransformed
notrfLot.reg = lm(log(P)~DRV + REC + FFIN + GHW + CA + GAR + REG + 
							 	LOT+log(BDMS) + log(FB) + log(STY))
summary(notrfLot.reg)


png(filename = "ExtQ5-compplusresid.png", width = 3000, height = 3000, res = 300)
par(mfrow = c(2, 2), cex = 1.3, mar = c(4, 4, 1, 1))

plot(LOT, log(P),
		 pch = 19,
		 col = rgb(0, 0, 0, .5))
# Plot the residual plus component against x
plot(LOT, resid(notrfLot.reg) + coef(notrfLot.reg)[9]*LOT,
		 pch = 19,
		 col = rgb(0, 0, 0, .5),
		 ylab = "Component + Residual")
plot(LOT, resid(notrfLot.reg),
		 pch = 19,
		 col = rgb(0, 0, 0, .5),
		 ylab = "Residual")
# Plot the component versus x
plot(LOT, coef(notrfLot.reg)[9]*LOT,
		 pch = 19,
		 col = rgb(0, 0, 0, .5),
		 ylab = "Component")
dev.off()


#Box-Tidwell
boxTidwell(log(P)~LOT, ~DRV + REC + FFIN + GHW + CA + GAR
						+ REG +log(BDMS) + log(FB) + log(STY))

#had to remove log(Lot)  is this okay? output....
# Score Statistic  p-value MLE of lambda
# -3.981082 6.86e-05    -0.2989201
# 
# iterations =  3

#suggests doing LOT^(-0.3)

curve(log(x),from = 0, to = 20000, xname = "x")
curve(x^(-0.3), from = 0, to = 20000, xname = "x")
curve(x^(0.06), from = 0, to = 20000, xname = "x")



# 6. Create a semi-parametric Generalized Additive Model (GAM) as an extension
# of the benchmark model with log transformed house prices. Use smoothing only
# for the lot size variable. Look at the partial residual plot of the lot size
# variable from the GAM. What transformation to the lot size variable looks
# appropriate? How does this compare to your conclusion from the component plus
# residual plot and Box-Tidwell transformation? (1 point)

gam1 <- gam(log(P)~DRV + REC + FFIN + GHW + CA + GAR + REG + 
							s(LOT)+log(BDMS) + log(FB) + log(STY))


png(filename = "ExtQ6-gam1.png", width = 3000, height = 3000, res = 300)
par(cex = 1.3)
plot(gam1,
		 residuals = TRUE, # Include the partial residuals
		 shade = TRUE, # Include shaded confidence bands
		 pages = 1,
		 scale = 0,
		 cex = 3)
dev.off()


# 7. For the forecasting story, compare the semi-parametric GAM used in the
# previous section against the benchmark model with whatever you think is the
# most appropriate transformation for the lot size variable (based on what
# you've learned from the component plus residual plot, Box-Tidwell transform,
# and partial residual plot from the GAM). Does your model or the
# semi-parametric model perform better in terms of cross-validation? Summarize
# what you think this means. (1 point)

#add cell id to HousePrices
HousePrices$id <- 1:length(HousePrices$price)
set.seed(10)
rMSEavg1 = c()
rMSEavg2 = c()
rMSEavg3 = c()

for(i in 1:1000){ # Loop i from 1 to 1000
	
	train.id <- sample(HousePrices$id, # Sample the cell ids
										 4*length(HousePrices$id)/5, # Sample 2/3 of the data
										 replace = FALSE) # Sample without replacement
	# Create "test" data that are the cell ids not selected
	test.id <- HousePrices$id[ - train.id]
	
	#Redefine variable	
	tst.P = HousePrices[HousePrices$id %in% test.id,]$price
	tst.DRV = HousePrices[HousePrices$id %in% test.id,]$driveway
	tst.REC = HousePrices[HousePrices$id %in% test.id,]$recreation
	tst.FFIN = HousePrices[HousePrices$id %in% test.id,]$fullbase
	tst.GHW = HousePrices[HousePrices$id %in% test.id,]$gasheat
	tst.CA = HousePrices[HousePrices$id %in% test.id,]$aircon
	tst.GAR = HousePrices[HousePrices$id %in% test.id,]$garage
	tst.REG = HousePrices[HousePrices$id %in% test.id,]$prefer
	tst.LOT = HousePrices[HousePrices$id %in% test.id,]$lotsize
	tst.BDMS = HousePrices[HousePrices$id %in% test.id,]$bedrooms
	tst.FB = HousePrices[HousePrices$id %in% test.id,]$bathrooms
	tst.STY = HousePrices[HousePrices$id %in% test.id,]$stories
	
	#TRAINING 

	#Benchmark
	train1 = lm(log(price)~driveway + recreation + fullbase + gasheat + aircon + garage +
								prefer + log(lotsize) + log(bedrooms) + log(bathrooms) + log(stories),
							data = HousePrices[HousePrices$id %in% train.id,])
	
	#Box-Tidwell Results: 
	train2 = lm(log(price)~driveway + recreation + fullbase + gasheat + aircon + garage +
								prefer + I(lotsize^(-0.3)) + log(bedrooms) + log(bathrooms) + log(stories),
							data = HousePrices[HousePrices$id %in% train.id,])
	
	#GAM model
	train3 = gam(log(price)~driveway + recreation + fullbase + gasheat + aircon + garage +
								prefer + s(lotsize) + log(bedrooms) + log(bathrooms) + log(stories),
							data = HousePrices[HousePrices$id %in% train.id,])
	
		
	#TESTING
	test1 <- (log(tst.P)-
							predict(train1, HousePrices[HousePrices$id %in% test.id,]))^2

	test2 <- (log(tst.P)-
							predict(train2, HousePrices[HousePrices$id %in% test.id,]))^2
	
	test3 <- (log(tst.P)-
							predict(train3, HousePrices[HousePrices$id %in% test.id,]))^2
	
	rMSEtest1 <- sqrt(sum(test1)/length(test1))
	rMSEtest2 <- sqrt(sum(test2)/length(test2))
	rMSEtest3 <- sqrt(sum(test3)/length(test3))	
	
	rMSEavg1 <- append(rMSEavg1, rMSEtest1)
	rMSEavg2 <- append(rMSEavg2, rMSEtest2)
	rMSEavg3 <- append(rMSEavg3, rMSEtest3)
}

summary(rMSEavg1)
summary(rMSEavg2)
summary(rMSEavg3)
	
	
# 8. Compare heteroskedasticity robust standard error estimates to "classical"
# standard error estimates for the benchmark model in Table III with log
# transformed house prices. Is there any difference in the standard errors?
# Comment on what this says about potential model misspecification. (1 point)
summary(bench.reg)
coeftest(bench.reg,
				 vcov = vcovHC(bench.reg, type = "HC0"),
				 df = df.residual(bench.reg))



# 9. Briefly comment on the statistical inference and causality stories. (1
# point)